#' ---
#' title: "Replication script for Geographical representation under a single nationwide district: the case of the Netherlands"
#' author: "Marijn Nagtzaam and Tom Louwerse"
#' ---


# Load packages -----------------------------------------------------------
library(lme4)
library(effects)
library(ggplot2)
library(plyr)
library(DescTools)
library(ggpubr)
library(mokken)
library(tidyverse)


# Load data ---------------------------------------------------------------
Dataset <- readRDS("Dataset.rds")

Dataset$regional_ties1 <- factor(Dataset$regional_ties, labels = c("No ties", "1", "2", "3", "Four ties"))
Dataset$regional_ties2 <- factor(Dataset$regional_ties, labels = c("No ties", "1", "2", "3 & 4", "3 & 4"))

Dataset$place_of_residence <- factor(Dataset$place_of_residence, levels = c(0,1), labels = c("Not living in province", "Living in province"))
Dataset$education1 <- factor(Dataset$education1, levels = c(0,1), labels = c("No education in province", "Education in province"))
Dataset$education2 <- factor(Dataset$education2, levels = c(0,1), labels = c("No education in province", "Education in province"))
Dataset$place_of_birth <- factor(Dataset$place_of_birth, levels = c(0,1), labels = c("Not born in province", "Born in province"))

#Subset dataset speeches
DATASET.SPEECHES <- Dataset
DATASET.SPEECHES <- DATASET.SPEECHES[DATASET.SPEECHES$speeches_total > 100 & DATASET.SPEECHES$joined_cabinet == 0,]
DATASET.SPEECHES <- DATASET.SPEECHES[apply(DATASET.SPEECHES, 1, function(y) !all(is.na(y))),]

#Subset dataset questions
DATASET.QUESTIONS <- Dataset
DATASET.QUESTIONS <- DATASET.QUESTIONS[DATASET.QUESTIONS$questions_total > 10 & DATASET.QUESTIONS$joined_cabinet == 0, ]
DATASET.QUESTIONS <- DATASET.QUESTIONS[apply(DATASET.QUESTIONS, 1, function(y) !all(is.na(y))),]


# Descriptive data -----------------------------------------

# Regional ties (table 2)
regional_ties_speeches.freq <- Freq(DATASET.SPEECHES$regional_ties1)
regional_ties_questions.freq <- Freq(DATASET.QUESTIONS$regional_ties1)

# Average regional speeches and questions by the number of regional ties
data.frame(aggregate(DATASET.SPEECHES$speeches_relative, by = list(DATASET.SPEECHES$regional_ties2), mean))
data.frame(aggregate(DATASET.QUESTIONS$questions_relative, by = list(DATASET.QUESTIONS$regional_ties2), mean))

# Figure 1
figure1_A <- ggplot(DATASET.SPEECHES, aes(x=place_of_residence, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of residence", y = "Share of regional speeches")+
  scale_y_continuous(limits=c(0,0.1)) +
  theme_minimal()
plot(figure1_A)

figure1_B <- ggplot(DATASET.QUESTIONS, aes(x=place_of_residence, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of residence", y = "Share of regional questions")+
  scale_y_continuous(limits=c(0,0.2)) +
  theme_minimal()
plot(figure1_B)

# Figure 2
figure2_A <- ggplot(DATASET.SPEECHES, aes(x=regional_ties2, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Regional ties", y = "Share of regional speeches") +
  scale_y_continuous(limits=c(0,0.075)) +
  theme_minimal()
plot(figure2_A)

figure2_B <- ggplot(DATASET.QUESTIONS, aes(x=regional_ties2, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Regional ties", y = "Share of regional questions")+
  scale_y_continuous(limits=c(0,0.25)) +
  theme_minimal()
plot(figure2_B)

#Appendix A - Speeches
table_A1 <- join_all(list(ddply(DATASET.SPEECHES, "province", summarise, Mean.A=mean(speeches_province)),
                         ddply(DATASET.SPEECHES, "province", summarise, SD.A=sd(speeches_province)),
                         ddply(DATASET.SPEECHES, "province", summarise, MAX.A=max(speeches_province)),
                         ddply(DATASET.SPEECHES, "province", summarise, Mean.R=mean(speeches_relative)),
                         ddply(DATASET.SPEECHES, "province", summarise, SD.R=sd(speeches_relative)),
                         ddply(DATASET.SPEECHES, "province", summarise, MAX.R=max(speeches_relative))),
                     by = "province")

figure_A1 <- ggplot(DATASET.SPEECHES, aes(x=speeches_relative))+
             geom_histogram(color="black", fill="white")+
             facet_wrap(province ~ ., ncol=3, scales = "free") +
             geom_vline(data=table_A1, aes(xintercept=Mean.R, color="red"), linetype="dashed") +
             labs(x = "Share of regional speeches") +
             theme(legend.position = "none")

#Appendix A - Questions
table_A2 <- join_all(list(ddply(DATASET.QUESTIONS, "province", summarise, Mean.A=mean(questions_province)),
                         ddply(DATASET.QUESTIONS, "province", summarise, SD.A=sd(questions_province)),
                         ddply(DATASET.QUESTIONS, "province", summarise, MAX.A=max(questions_province)),
                         ddply(DATASET.QUESTIONS, "province", summarise, Mean.R=mean(questions_relative)),
                         ddply(DATASET.QUESTIONS, "province", summarise, SD.R=sd(questions_relative)),
                         ddply(DATASET.QUESTIONS, "province", summarise, MAX.R=max(questions_relative))),
                     by = "province")
plot(figure_A1)

figure_A2 <- ggplot(DATASET.QUESTIONS, aes(x=questions_relative))+
             geom_histogram(color="black", fill="white")+
             facet_wrap(province ~ ., ncol=3, scales = "free") +
             geom_vline(data=table_A2, aes(xintercept=Mean.R, color="red"), linetype="dashed") +
             labs(x = "Share of regional questions") +
             theme(legend.position = "none")
plot(figure_A2)

#Appendix B - Speeches
figureB1_1 <- ggplot(DATASET.SPEECHES, aes(x=place_of_birth, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of birth", y = "Share of regional speeches")+
  scale_y_continuous(limits=c(0,0.1)) +
  theme_minimal()

figureB1_2 <- ggplot(DATASET.SPEECHES, aes(x=place_of_residence, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of residence", y = "")+
  scale_y_continuous(limits=c(0,0.1)) +
  theme_minimal()

figureB1_3 <- ggplot(DATASET.SPEECHES, aes(x=education1, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Primary or secondary education", y = "Share of regional speeches")+
  scale_y_continuous(limits=c(0,0.1)) +
  theme_minimal()

figureB1_4 <- ggplot(DATASET.SPEECHES, aes(x=education2, y=speeches_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Tertiary education", y = "")+
  scale_y_continuous(limits=c(0,0.1)) +
  theme_minimal()

figureB1 <- ggarrange(figureB1_1, figureB1_2, figureB1_3, figureB1_4, 
                     ncol = 2, nrow = 2)

#Appendix B - Questions
figureB2_1 <- ggplot(DATASET.QUESTIONS, aes(x=place_of_birth, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of birth", y = "Share of regional questions")+
  scale_y_continuous(limits=c(0,0.2)) +
  theme_minimal()

figureB2_2 <- ggplot(DATASET.QUESTIONS, aes(x=place_of_residence, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Place of residence", y = "")+
  scale_y_continuous(limits=c(0,0.2)) +
  theme_minimal()

figureB2_3 <- ggplot(DATASET.QUESTIONS, aes(x=education1, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Primary or secondary education", y = "Share of regional questions")+
  scale_y_continuous(limits=c(0,0.2)) +
  theme_minimal()

figureB2_4 <- ggplot(DATASET.QUESTIONS, aes(x=education2, y=questions_relative)) + 
  geom_boxplot(fill="gray")+
  labs(x="Tertiary education", y = "")+
  scale_y_continuous(limits=c(0,0.2)) +
  theme_minimal()

figureB2 <- ggarrange(figureB2_1, figureB2_2, figureB2_3, figureB2_4, 
                     ncol = 2, nrow = 2)

#Appendix C - Speeches
figureC1 <- ggplot(DATASET.SPEECHES, aes(x=regional_ties2, y=speeches_relative, fill=province)) + 
  geom_boxplot(fill="gray")+
  labs(x="Regional ties", y = "Share of regional speeches")+
  facet_wrap(~province, scales = "free_y") +
  theme_minimal()
plot(figureC1)

#Appendix C - Questions
figureC2 <- ggplot(DATASET.QUESTIONS, aes(x=regional_ties2, y=questions_relative, fill=province)) + 
  geom_boxplot(fill="gray")+
  labs(x="Regional ties", y = "Share of regional questions")+
  facet_wrap(~province, scales = "free_y") +
  theme_minimal()
plot(figureC2)


# Mokken scaling analysis -------------------------------------------------
dta_mokken <- Dataset %>%
  select(place_of_birth, place_of_residence, education1, education2) %>%
  mutate(across(.fns = as.numeric)) %>%
  mutate(across(.fns = function(x) x - 1))


coefH(dta_mokken)


# Regression models -------------------------------------------------

#Exclude MP ID = 32366 from speeches
DATASET.SPEECHES <- DATASET.SPEECHES[DATASET.SPEECHES$id != "32366",]

# Speeches - Model 1 (Regional ties index, table 4)
MODEL_SPEECHES_1 <- lmer(speeches_relative ~
                         regional_ties2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                         regional_ties2:votes +
                         regional_ties2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_SPEECHES_1)

# Speeches - Model 2 (Seperate indicators for regional ties, table 4)
MODEL_SPEECHES_2 <- lmer(speeches_relative ~
                         place_of_birth + place_of_residence + education1 + education2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                         place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                         place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_SPEECHES_2)

# Questions - Model 1 (Regional ties index, table 5)
MODEL_QUESTIONS_1 <- lmer(questions_relative ~
                         regional_ties2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                         regional_ties2:votes +
                         regional_ties2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_QUESTIONS_1)

# Questions - Model 2 (Seperate indicators for regional ties, table 5)
MODEL_QUESTIONS_2 <- lmer(questions_relative ~
                         place_of_birth + place_of_residence + education1 + education2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                         place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                         place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_QUESTIONS_2)

# Effect plots -------------------------------------------------

#Interaction effects with Regional electoral performance (Figure 3)
figure3A <- plot(effect("regional_ties2:votes", MODEL_SPEECHES_1),
                  ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                  xlab = "Regional electoral performance", ylab = "Regional speeches",
                  main = "Regional ties (Speeches)", factor.names = FALSE)

figure3B2 <- plot(effect("place_of_residence:votes", MODEL_SPEECHES_2),
                  ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                  xlab = "Regional electoral performance", ylab = "Regional speeches",
                  main = "Place of residence (Speeches)", factor.names = FALSE)

figure3C <- plot(effect("regional_ties2:votes", MODEL_QUESTIONS_1),
                 ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                 xlab = "Regional electoral performance", ylab = "Regional questions",
                 main = "Regional ties (Questions)", factor.names = FALSE)

figure3 <- ggarrange(figure3A, figure3B2, figure3C,
                      labels = c("A", "B", "C"),
                      ncol = 2, nrow = 2, widths = c(2, 1))
plot(figure3)

#Interaction effects with Regional attachment (Figure 4)
figure4A <- plot(effect("regional_ties2:regional_attachment", MODEL_SPEECHES_1),
                 ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                 xlab = "Regional attachment", ylab = "Regional speeches",
                 main = "Regional ties (Speeches)", factor.names = FALSE)

figure4B2 <- plot(effect("place_of_residence:regional_attachment", MODEL_SPEECHES_2),
                  ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                  xlab = "Regional attachment", ylab = "Regional speeches",
                  main = "Place of residence (Speeches)", factor.names = FALSE)

figure4C <- plot(effect("regional_ties2:regional_attachment", MODEL_QUESTIONS_1),
                 ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                 xlab = "Regional attachment", ylab = "Regional questions",
                 main = "Regional ties (Questions)", factor.names = FALSE)

figure4D2 <- plot(effect("place_of_residence:regional_attachment", MODEL_QUESTIONS_2),
                  ci.style="lines", rescale.axis=FALSE, rug=TRUE, multiline=FALSE,
                  xlab = "Regional attachment", ylab = "Regional questions",
                  main = "Place of residence (Questions)", factor.names = FALSE)

figure4 <- ggarrange(figure4A, figure4B2, figure4C, figure4D2,
                     labels = c("A", "B", "C", "D"),
                     ncol = 2, nrow = 2, widths = c(2, 1))
plot(figure4)


# Additional models appendix -------------------------------------------------

# Models Speeches (with three-way interaction)
MODEL_SPEECHES_APPENDIX_1 <- lmer(speeches_relative ~
                               regional_ties2 +
                               coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                               regional_ties2:votes +
                               regional_ties2:regional_attachment +
                               votes:regional_attachment +
                               regional_ties2:votes:regional_attachment +
                               (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_SPEECHES_APPENDIX_1)

MODEL_SPEECHES_APPENDIX_2 <- lmer(speeches_relative ~
                               place_of_birth + place_of_residence + education1 + education2 +
                               coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                               place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                               place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                               votes:regional_attachment +
                               place_of_birth:votes:regional_attachment + place_of_residence:votes:regional_attachment + education1:votes:regional_attachment + education2:votes:regional_attachment +
                               (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_SPEECHES_APPENDIX_2)

# Models Questions (with three-way interaction)
MODEL_QUESTIONS_APPENDIX_1 <- lmer(questions_relative ~
                             regional_ties2 +
                             coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                             regional_ties2:votes +
                             regional_ties2:regional_attachment +
                             regional_ties2:votes:regional_attachment +
                             (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_QUESTIONS_APPENDIX_1)

MODEL_QUESTIONS_APPENDIX_2 <- lmer(questions_relative ~
                                     place_of_birth + place_of_residence + education1 + education2 +
                                     coalition + seats + regional_attachment + party_group_leader + votes + province_size +
                                     place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                                     place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                                     votes:regional_attachment +
                                     place_of_birth:votes:regional_attachment + place_of_residence:votes:regional_attachment + education1:votes:regional_attachment + education2:votes:regional_attachment +
                                     (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_QUESTIONS_APPENDIX_2)


# Additional models with distance -------------------------------------------------

# MODEL_AFSTAND Speeches A (Binding index)
MODEL_AFSTAND_SPEECHES_A <- lmer(speeches_relative ~
                           regional_ties2 +
                           coalition + seats + regional_attachment + party_group_leader + votes + province_size + distance_TH +
                           regional_ties2:votes +
                           regional_ties2:regional_attachment +
                           votes:regional_attachment +
                           (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_AFSTAND_SPEECHES_A)

# MODEL_AFSTAND speeches B (Losse indicatoren)
MODEL_AFSTAND_SPEECHES_B <- lmer(speeches_relative ~
                           place_of_birth + place_of_residence + education1 + education2 +
                           coalition + seats + regional_attachment + party_group_leader + votes + province_size +  distance_TH +
                           place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                           place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                           votes:regional_attachment +
                           (1 | id) + (1 | province), data=DATASET.SPEECHES, REML=FALSE)
summary(MODEL_AFSTAND_SPEECHES_B)

# MODEL_AFSTAND Questions A (Binding index)
MODEL_AFSTAND_QUESTIONS_A <- lmer(questions_relative ~
                         regional_ties2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +  distance_TH +
                         regional_ties2:votes +
                         regional_ties2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_AFSTAND_QUESTIONS_A)

# MODEL_AFSTAND Questions B (Losse indicatoren)
MODEL_AFSTAND_QUESTIONS_B <- lmer(questions_relative ~
                         place_of_birth + place_of_residence + education1 + education2 +
                         coalition + seats + regional_attachment + party_group_leader + votes + province_size +  distance_TH +
                         place_of_birth:votes + place_of_residence:votes + education1:votes + education2:votes +
                         place_of_birth:regional_attachment + place_of_residence:regional_attachment + education1:regional_attachment + education2:regional_attachment +
                         votes:regional_attachment +
                         (1 | id) + (1 | province), data=DATASET.QUESTIONS, REML=FALSE)
summary(MODEL_AFSTAND_QUESTIONS_B)


# Descriptive data for appendix F (Party differences) -------------------------------------------------
party_N_Speeches <- count(DATASET.SPEECHES, party) 
party_N_Speeches$Kamerleden <- party_N_Speeches$n / 12

party_N_Questions <- count(DATASET.QUESTIONS, party) 
party_N_Questions$Kamerleden <- party_N_Questions$n / 12


#Speeches
A <- as.data.frame(with(DATASET.SPEECHES, tapply(speeches_relative, list(party, place_of_residence), mean)))
B <- as.data.frame(with(DATASET.SPEECHES, tapply(speeches_relative, list(party, place_of_residence), sd)))

A$party <- rownames(A)
B$party <- rownames(B)

table_F1 <- join_all(list(party_N_Speeches, A,B), by = "party")
rm(A,B)


#Questions
A <- as.data.frame(with(DATASET.QUESTIONS, tapply(questions_relative, list(party, place_of_residence), mean)))
B <- as.data.frame(with(DATASET.QUESTIONS, tapply(questions_relative, list(party, place_of_residence), sd)))

A$party <- rownames(A)
B$party <- rownames(B)

table_F2 <- join_all(list(party_N_Questions, A,B), by = "party")
rm(A,B)